{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import yaml\n",
    "import requests\n",
    "import openai\n",
    "import textwrap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fmt(text):\n",
    "    wrapper = textwrap.TextWrapper(width=80, initial_indent=\"\", subsequent_indent=\"\")\n",
    "    return wrapper.fill(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import textwrap\n",
    "\n",
    "def wrap_markdown_text(text, width=120):\n",
    "    # Split the text into lines and wrap each line to no more than 80 columns,\n",
    "    # except for lines inside tables\n",
    "    lines = text.split(\"\\n\")\n",
    "    wrapped_lines = []\n",
    "    in_table = False\n",
    "    for line in lines:\n",
    "        # Check if the line starts or ends a table\n",
    "        if re.match(r\"^\\s*\\|\", line):\n",
    "            in_table = True\n",
    "        elif re.match(r\"^\\s*$\", line):\n",
    "            in_table = False\n",
    "\n",
    "        # Wrap the line if it's not inside a table\n",
    "        if not in_table:\n",
    "            wrapped_line = textwrap.fill(line, width=width)\n",
    "        else:\n",
    "            wrapped_line = line\n",
    "\n",
    "        wrapped_lines.append(wrapped_line)\n",
    "\n",
    "    # Join the wrapped lines back into a single string\n",
    "    wrapped_text = \"\\n\".join(wrapped_lines)\n",
    "\n",
    "    return wrapped_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import textwrap\n",
    "\n",
    "def wrap_markdown_text(text):\n",
    "    # Split the text into lines and wrap each line to no more than 80 columns,\n",
    "    # except for lines inside tables, code blocks, and triple-quoted text\n",
    "    lines = text.split(\"\\n\")\n",
    "    wrapped_lines = []\n",
    "    in_table = False\n",
    "    in_code_block = False\n",
    "    in_triple_quotes = False\n",
    "    for line in lines:\n",
    "        # Check if the line starts or ends a table\n",
    "        if re.match(r\"^\\s*\\|\", line):\n",
    "            in_table = True\n",
    "        elif re.match(r\"^\\s*$\", line):\n",
    "            in_table = False\n",
    "\n",
    "        # Check if the line is inside a code block\n",
    "        if re.match(r\"^```\", line):\n",
    "            in_code_block = not in_code_block\n",
    "\n",
    "        # Check if the line is inside triple quotes\n",
    "        if re.match(r'^\\s*\"\"\"', line):\n",
    "            in_triple_quotes = not in_triple_quotes\n",
    "\n",
    "        # Wrap the line if it's not inside a table, code block, or triple-quoted text\n",
    "        if not in_table and not in_code_block and not in_triple_quotes:\n",
    "            wrapped_line = textwrap.fill(line, width=80)\n",
    "        else:\n",
    "            wrapped_line = line\n",
    "\n",
    "        wrapped_lines.append(wrapped_line)\n",
    "\n",
    "    # Join the wrapped lines back into a single string\n",
    "    wrapped_text = \"\\n\".join(wrapped_lines)\n",
    "\n",
    "    return wrapped_text\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def get_gh_token():\n",
    "    config_path = os.path.join(os.path.expanduser(\"~\"), \".config\", \"gh\", \"hosts.yml\")\n",
    "\n",
    "    with open(config_path, \"r\") as f:\n",
    "        config = yaml.safe_load(f)\n",
    "\n",
    "    if \"github.com\" in config and \"oauth_token\" in config[\"github.com\"]:\n",
    "        return config[\"github.com\"][\"oauth_token\"]\n",
    "\n",
    "    raise Exception(\"No GitHub token found in config file\")\n",
    "\n",
    "token = get_gh_token()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# Clarification on what `to_tuple()` does?\n",
      "\n",
      "Sorry I'm confused -- It's unclear from the documentation (and my reading of\n",
      "[the code](https://github.com/webdataset/webdataset/blob/cb1aa32aca3f5fa3f214c38\n",
      "a2145b14cd28629cc/webdataset/compat.py#L66) ) what the purpose of `.to_tuple()`\n",
      "is.\n",
      "\n",
      "It is used throughout the documentation (e.g., [\"Getting\n",
      "Started\"](https://webdataset.github.io/webdataset/gettingstarted/),  [\"How it\n",
      "Works\"](https://webdataset.github.io/webdataset/howitworks/#how-it-works)) and\n",
      "[README](https://github.com/webdataset/webdataset#readme) yet never described\n",
      "what it actually is or does.  It seems to be absolutely crucial since it appears\n",
      "in every example, but... what is it?   And how are we to learn which arguments\n",
      "to use with it?\n",
      "\n",
      "For example, what is the difference between `.to_tuple(\"png\", \"json\")` and\n",
      "`.to_tuple(\"png;jpg;jpeg\", \"json\")`?  Why is \"json\" the last argument?\n",
      "\n",
      "Does this mean that it's going to produce JSON text for the Python tuple\n",
      "`(\"png\", \"jpg\", \"jpeg\")`?  Why would we want that -- is it that hard to simply\n",
      "write` '(\"png\", \"jpg\", \"jpeg\")'`?\n",
      "\n",
      "\n",
      "## Comment by tmbdev on 2023-03-03T19:58:11Z:\n",
      "\n",
      "The `to_tuple` method extracts fields from a dictionary based on extension.\n",
      "\n",
      "For example, for image classification training, you may have classes in .cls\n",
      "files and images in .png, .jpg, or .jpeg files. You would then write:\n",
      "\n",
      "```\n",
      "ds = WebDataset(...) ... .to_tuple(\"png;jpg;jpeg\", \"cls\")\n",
      "dl = DataLoader(ds, ...)\n",
      "for image, cls in dl:\n",
      "    train(image, cls)\n",
      "```\n",
      "\n",
      "I'll try to improve the documentation.\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "\n",
    "def get_documentation_issues_and_comments(owner, repo, token):\n",
    "    # Retrieve all issues for the repository\n",
    "    issues_url = f\"https://api.github.com/repos/{owner}/{repo}/issues?state=all&per_page=100&labels=documentation\"\n",
    "    issues_response = requests.get(issues_url, headers={\"Authorization\": f\"Bearer {token}\"})\n",
    "    issues = issues_response.json()\n",
    "\n",
    "    # Retrieve all comments for each issue and concatenate them onto the issue string\n",
    "    for issue in issues:\n",
    "        issue_number = issue[\"number\"]\n",
    "        comments_url = f\"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/comments?per_page=100\"\n",
    "        comments_response = requests.get(comments_url, headers={\"Authorization\": f\"Bearer {token}\"})\n",
    "        comments = comments_response.json()\n",
    "\n",
    "        # Concatenate issue and comments as text\n",
    "        text = f\"# {issue['title']}\\n\\n{issue['body']}\\n\\n\"\n",
    "        for comment in comments:\n",
    "            text += f\"## Comment by {comment['user']['login']} on {comment['created_at']}:\\n\\n{comment['body']}\\n\\n\"\n",
    "        \n",
    "        # Yield the issue text with all comments concatenated\n",
    "        yield issue_number, text\n",
    "\n",
    "issues = get_documentation_issues_and_comments(\"webdataset\", \"webdataset\", token)\n",
    "print(wrap_markdown_text(next(iter(issues))[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "import openai\n",
    "\n",
    "openai.api_key = open(os.path.expanduser(\"~/.ssh/openai-key\")).read().strip()\n",
    "\n",
    "model_id = 'gpt-3.5-turbo'\n",
    "\n",
    "# model_id = 'gpt-4.0'\n",
    "\n",
    "initial_prompt = \"\"\"\n",
    "Here is a discussion of an issue from the WebDataset issue tracker.\n",
    "\n",
    "Please summarize this issue in the form of an FAQ entry.\n",
    "\n",
    "Summarize the question in a single sentence and precede it with \"Q: \".\n",
    "\n",
    "Then leave a blank line and start your answer with \"A: \". Provide a single paragraph for an answer. You may include a code example.\n",
    "\n",
    "Use markdown formatting to make the answer more readable.\n",
    "\"\"\"\n",
    "\n",
    "initial_context = [\n",
    "    {'role': 'system', 'content': initial_prompt}\n",
    "]\n",
    "\n",
    "def chatgpt(prompt, conversation, role='user', model_id=model_id):\n",
    "    if conversation is None:\n",
    "        conversation = initial_context.copy()\n",
    "    conversation.append({'role': role, 'content': prompt})\n",
    "    try:\n",
    "        response = openai.ChatCompletion.create(\n",
    "            model=model_id,\n",
    "            messages=conversation\n",
    "        )\n",
    "    except openai.error.InvalidRequestError as e:\n",
    "        msg = f\"Error: {str(e)[:80]}...\"\n",
    "        return msg, conversation\n",
    "    # api_usage = response['usage']\n",
    "    # print('Total token consumed: {0}'.format(api_usage['total_tokens']))\n",
    "    # stop means complete\n",
    "    # print(response['choices'][0].finish_reason)\n",
    "    # print(response['choices'][0].index)\n",
    "    rrole, response = response.choices[0].message.role, response.choices[0].message.content\n",
    "    conversation.append({'role': rrole, 'content': response})\n",
    "    return response, conversation\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Q: Why do periods in the base part of the filename cause issues in WebDataset\n",
      "and how can it be resolved?\n",
      "\n",
      "A: Periods in the base part of the filename cause issues in WebDataset as the\n",
      "periods are used to support multiple extensions like \".seg.jpg\". It is\n",
      "recommended to deal with this during dataset creation by avoiding such\n",
      "conventions. Also, in several places, you can use \"glob\" patterns like \"*.mp3\"\n",
      "to match extensions. Mapping the filenames between the tariterator and the\n",
      "tarfile_to_samples stages in the pipeline is possible, but not recommended.\n"
     ]
    }
   ],
   "source": [
    "\n",
    "issue_number, issue_text = next(iter(issues))\n",
    "response, _ = chatgpt(issue_text, None)\n",
    "print(wrap_markdown_text(response))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "79 \"# Guidance on using Webdataset for small embeddings\\n\\nI'm trying to use Webdatase\"\n",
      "73 '# Adding option to shuffle shards before splitting to workers, based on current '\n",
      "71 '# Unexpected Shuffling Behavior\\n\\nEven with buffer size and initial are set to th'\n",
      "68 '# Program stop at some iteration\\n\\nHi @tmbdev Thanks for sharing the excellent li'\n",
      "66 '# Pytorch Lightning integration\\n\\nHi\\r\\n\\r\\nCurrently, webdataset dataset using the d'\n",
      "49 '# Path separator not decoded correctly\\n\\nI tried to open a local tar file in Wind'\n",
      "36 '# Using webdataset with torch-xla in multiprocessing context\\n\\nI actually wrote a'\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "from itertools import islice\n",
    "for issue_number, issue_text in issues:\n",
    "    print(issue_number, repr(issue_text[:80]), file=sys.stderr)\n",
    "    ofile = \"faq/issue-%04d.md\" % issue_number\n",
    "    if os.path.exists(ofile):\n",
    "        continue\n",
    "    response, _ = chatgpt(issue_text, None)\n",
    "    with open(ofile, \"w\") as f:\n",
    "        f.write(wrap_markdown_text(response))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "\n",
    "result = \"\"\n",
    "for fname in sorted(glob.glob(\"faq/issue-*.md\")):\n",
    "    entry = open(fname).read()\n",
    "    result += entry + \"\\n\\n---\\n\\n\"\n",
    "    \n",
    "with open(\"FAQ.md\", \"w\") as f:\n",
    "    f.write(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
